林嶔 (Lin, Chin)
Lesson 12
經過上週的介紹以及實作之後,你應該已經大致了解了YOLO v3的結構了,而今天我們的實驗將從YOLO v1的訓練來做起。
我們已經知道YOLO model在網路的前面與一般的圖像分類模型是一樣的了,重點是輸出的結構不同,假定一張224×224×3的圖像經過一系列的卷積操作之後,接著剩下7×7×n的特徵圖之後…
– 對於圖像分類模型,大多是透過Pooling把這個特徵圖縮減成1×1×n的特徵,並對其做Softmax regression的輸出
– 對於YOLO model,他是對每一個Grid都要有一系列輸出,因此他就是不做Pooling直接再用1×1的卷積核進行運算,從而輸出7×7×m的輸出,最終我們再對7×7×m的部分做解碼(Decode)。
– 所以假設最終的特徵圖大小為7×7×n,那在皮卡丘識別任務中YOLO結構的輸出將會是7×7×6。
可信度:這是一個必須介於0至1的數值,所以需要經過Sigmoid轉換後方能輸出
y座標(row)的「相對」位置:這也是一個必須介於0至1的數值
x座標(column)的「相對」位置:這也是一個必須介於0至1的數值
寬度(x軸):這是一個必須大於0的數值,經過指數轉換可以把任意數轉換成符合需求,但常規的做法是把原始值經過對數轉換,而輸出值是不做任何處理的
高度(y軸):這個部分與寬度相同
類別1的可能性:在YOLO v1中,是將類別1至類別N的可能性一起做Softmax,但在YOLO v3中將這個部分全部改成Sigmoid輸出,以允許多重標籤的物件
為什麼要使用「相對」位置而非「絕對」位置?
為什麼在高度/寬度的輸出不是使用ReLU或是指數轉換,而是將原始值做對數處理後而輸出值保持原樣?
– 讓我們先從這裡下載一個做圖像識別的MobileNet v2模型,我們先試試它的圖像分類效果:
# Load required packages (mxnet for the model; imager/jpeg/OpenImageR for image I/O)
library(mxnet)
library(imager)
library(jpeg)
library(OpenImageR)
library(magrittr)
# Load a pre-trained MobileNet v2 model (prefix 'model/mobilev2', checkpoint epoch 0)
mobile_model <- mx.model.load("model/mobilev2", 0)
# ImageNet class labels, one per line (synset id + human-readable names)
label_names <- readLines("model/synset.txt", encoding = "UTF-8")
#Define image processing functions
# Resize an image and apply MobileNet v2 preprocessing: rescale to 0-255 and
# subtract the per-channel ImageNet means.
# Returns a (width, height, channel, num) array ready for mxnet prediction.
preproc.image <- function(im, width = 224, height = 224, method = 'bilinear') {
  channel_means <- c(123.68, 116.78, 103.94)
  out <- as.array(resizeImage(image = im, width = width, height = height, method = method)) * 255
  for (ch in 1:3) {
    out[,,ch] <- out[,,ch] - channel_means[ch]
  }
  # Reshape to the 4-D (width, height, channel, num) layout mxnet expects
  dim(out) <- c(width, height, 3, 1)
  return(out)
}
# Read a test image and run it through the classifier
img <- readJPEG("image/4.jpg")
# Pre-processing (resize + mean subtraction)
normed <- preproc.image(img)
# Display the original image
par(mar = rep(0, 4))
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage(img, 0, 0, 1, 1, interpolate = FALSE)
# Predict class probabilities and report the top-1 label
prob <- predict(mobile_model, X = normed, ctx = mx.cpu())
cat(paste0(label_names[which.max(prob)], ': ', formatC(max(prob), 4, format = 'f'), '\n'))
## n02497673 Madagascar cat, ring-tailed lemur, Lemur catta: 1.0000
– 這裡的函數「DWCONV_function」以及「CONV_function」都只是在原先的基礎上再增加卷積層,關鍵是函數「YOLO_map_function」的部分。
– 根據剛剛的定義你會發現除了高度/寬度的輸出(第4項與第5項)不需要經過Sigmoid轉換之外,剩下都需要,所以我們先用函數「mx.symbol.SliceChannel」把他們拆開,最後再各自處理過後再用函數「mx.symbol.concat」合併。
# Libraries
library(mxnet)
library(magrittr)
## Define the model architecture
## Use pre-trained model and fine tuning
# Load MobileNet v2 (its classification head will be replaced by the YOLO head)
Pre_Trained_model <- mx.model.load('model/mobilev2', 0)
# Get the internal output: take the last convolutional feature map as the backbone output
Mobile_symbol <- Pre_Trained_model$symbol
Mobile_All_layer <- Mobile_symbol$get.internals()
basic_out <- which(Mobile_All_layer$outputs == 'conv6_3_linear_bn_output') %>% Mobile_All_layer$get.output()
# mx.symbol.infer.shape(basic_out, data = c(256, 256, 3, 7))$out.shapes
# conv6_3_linear_bn_output out shape = 8 8 320 n (if input shape = 256 256 3 n)
# Convolution layer for specific mission and training new parameters
# 1. Additional some architecture for better learning
# Inverted-residual (MobileNet v2 style) block: 1x1 expand -> 3x3 depthwise -> 1x1 project.
# When `residual` is TRUE the block input is added back (skip connection) and no
# final activation is applied; otherwise a LeakyReLU follows the projection.
DWCONV_function <- function (indata, num_filters = 256, Inverse_coef = 6, residual = TRUE, name = 'lvl1', stage = 1) {
  expanded_filters <- num_filters * Inverse_coef
  # 1x1 expansion convolution
  e_conv <- mx.symbol.Convolution(data = indata, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                                  no.bias = TRUE, num.filter = expanded_filters,
                                  name = paste0(name, '_', stage, '_expend'))
  e_bn <- mx.symbol.BatchNorm(data = e_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_expend_bn'))
  e_act <- mx.symbol.LeakyReLU(data = e_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_expend_relu'))
  # 3x3 depthwise convolution (num.group == num.filter makes it depthwise)
  d_conv <- mx.symbol.Convolution(data = e_act, kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1),
                                  no.bias = TRUE, num.filter = expanded_filters, num.group = expanded_filters,
                                  name = paste0(name, '_', stage, '_dwise'))
  d_bn <- mx.symbol.BatchNorm(data = d_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_dwise_bn'))
  d_act <- mx.symbol.LeakyReLU(data = d_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_dwise_relu'))
  # 1x1 linear projection back to num_filters channels
  p_conv <- mx.symbol.Convolution(data = d_act, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                                  no.bias = TRUE, num.filter = num_filters,
                                  name = paste0(name, '_', stage, '_restore'))
  p_bn <- mx.symbol.BatchNorm(data = p_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_restore_bn'))
  if (!residual) {
    # No skip connection: finish with an activation instead
    return(mx.symbol.LeakyReLU(data = p_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_restore_relu')))
  }
  # Skip connection: elementwise add with the block input
  mx.symbol.broadcast_plus(lhs = indata, rhs = p_bn, name = paste0(name, '_', stage, '_block'))
}
# Plain 1x1 convolution block: Conv -> BatchNorm -> ReLU.
CONV_function <- function (indata, num_filters = 256, name = 'lvl1', stage = 1) {
  prefix <- paste0(name, '_', stage)
  out <- mx.symbol.Convolution(data = indata, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                               no.bias = TRUE, num.filter = num_filters,
                               name = paste0(prefix, '_conv'))
  out <- mx.symbol.BatchNorm(data = out, fix_gamma = FALSE, name = paste0(prefix, '_bn'))
  out <- mx.symbol.Activation(data = out, act.type = 'relu', name = paste0(prefix, '_relu'))
  out
}
# Map the final feature map to YOLO outputs with a 1x1 convolution.
# Each box predicts final_map/num_box features:
#   1 = confidence, 2 = relative row, 3 = relative col,
#   4 = log-width, 5 = log-height, 6+ = class probabilities.
# All features except log-width/height (positions 4 and 5 within each box)
# are passed through a sigmoid; 4 and 5 stay linear because their labels are
# log-transformed and may be negative.
YOLO_map_function <- function (indata, final_map = 6, num_box = 1, drop = 0.2, name = 'lvl1') {
  num_feature <- final_map / num_box
  dp <- mx.symbol.Dropout(data = indata, p = drop, name = paste0(name, '_drop'))
  conv <- mx.symbol.Convolution(data = dp, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                                no.bias = FALSE, num.filter = final_map, name = paste0(name, '_linearmap'))
  inter_split <- mx.symbol.SliceChannel(data = conv, num_outputs = final_map,
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, "_inter_split"))
  new_list <- list()
  for (k in 1:final_map) {
    # Position of this channel within its box (1..num_feature).
    # Fix: the original condition used `k %% num_box`, which is always 0 when
    # num_box = 1, so a sigmoid was wrongly applied to channels 4 and 5 too.
    feature_pos <- (k - 1) %% num_feature + 1
    if (!feature_pos %in% c(4, 5)) {
      new_list[[k]] <- mx.symbol.Activation(inter_split[[k]], act.type = 'sigmoid', name = paste0(name, "_yolomap_", k))
    } else {
      new_list[[k]] <- inter_split[[k]]
    }
  }
  yolomap <- mx.symbol.concat(data = new_list, num.args = final_map, dim = 1, name = paste0(name, "_yolomap"))
  return(yolomap)
}
# Assemble the YOLO head on top of the MobileNet backbone:
# two inverted-residual blocks, one plain conv block, then the YOLO output map
yolo_conv_1 <- DWCONV_function(indata = basic_out, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 1)
yolo_conv_2 <- DWCONV_function(indata = yolo_conv_1, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 2)
yolo_conv_3 <- CONV_function(indata = yolo_conv_2, num_filters = 320, name = 'yolo', stage = 3)
yolomap <- YOLO_map_function(indata = yolo_conv_3, final_map = 6, drop = 0.2, name = 'final')
第一個部分是對於y座標與x座標的損失
第二個部分是對於寬度與高度的損失
第三個部分是可信度該找出而答錯的損失
第四個部分是可信度該略過而答錯的損失
第五個部分是類別n的可能性的損失
– 另外,他還有個\(\lambda_{coord}\)以及\(\lambda_{noobj}\)兩個參數,根據YOLO v1 paper的建議分別被定為5以及0.5,這是因為物件識別是一個極度類別不平衡的任務,所以給予正向樣本較高的權重。
– 當然我們對y座標與x座標的部分是沒有辦法做修正的。
# 2. Custom loss function
# Masked mean-squared-error loss: the squared difference between prediction
# and label is kept only where `obj` is 1, averaged over all cells, and
# scaled by `lambda`.
MSE_loss_function <- function (indata, inlabel, obj, lambda) {
  residual <- mx.symbol.broadcast_minus(lhs = indata, rhs = inlabel)
  sq_residual <- mx.symbol.square(data = residual)
  masked <- mx.symbol.broadcast_mul(lhs = obj, rhs = sq_residual)
  avg_loss <- mx.symbol.mean(data = masked, axis = 0:3, keepdims = FALSE)
  avg_loss * lambda
}
# Masked binary cross-entropy loss. `eps` guards against log(0); cells where
# `obj` is 0 contribute nothing. Returns the negative mean log-likelihood
# scaled by `lambda`.
CE_loss_function <- function (indata, inlabel, obj, lambda, eps = 1e-4) {
  log_p <- mx.symbol.log(data = indata + eps)
  log_q <- mx.symbol.log(data = 1 - indata + eps)
  ll_pos <- mx.symbol.broadcast_mul(lhs = log_p, rhs = inlabel)
  ll_neg <- mx.symbol.broadcast_mul(lhs = log_q, rhs = 1 - inlabel)
  masked_ll <- mx.symbol.broadcast_mul(lhs = obj, rhs = ll_pos + ll_neg)
  mean_ll <- mx.symbol.mean(data = masked_ll, axis = 0:3, keepdims = FALSE)
  0 - mean_ll * lambda
}
# Build the YOLO v1 multi-part loss.
# The label/prediction maps are split channel-wise; for every box:
#   channel 1    : confidence -> weighted BCE (object term + no-object term)
#   channels 2-5 : row/col/log-w/log-h -> object-masked MSE
#   channels 6+  : class probabilities -> object-masked BCE
# `lambda` up-weights positive cells (extreme class imbalance) and
# `weight_classification` down-weights the classification channels.
YOLO_loss_function <- function (indata, inlabel, final_map = 6, num_box = 1, lambda = 10, weight_classification = 0.2, name = 'yolo') {
  num_feature <- final_map/num_box
  my_loss <- 0
  # Fix: these two names used paste() (space-separated), yielding symbol
  # names containing spaces, inconsistent with every other paste0 name here.
  yolomap_split <- mx.symbol.SliceChannel(data = indata, num_outputs = final_map,
                                          axis = 1, squeeze_axis = FALSE, name = paste0(name, '_yolomap_split'))
  label_split <- mx.symbol.SliceChannel(data = inlabel, num_outputs = final_map,
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, '_label_split'))
  for (j in 1:num_box) {
    for (k in 1:num_feature) {
      # Classification channels get a smaller weight than localization ones
      if (k %in% 1:5) {weight <- 1} else {weight <- weight_classification}
      pos <- (j - 1) * num_feature + k                       # absolute channel index for this box
      obj_mask <- label_split[[(j - 1) * num_feature + 1]]   # confidence channel = object indicator
      if (!k %in% c(2:5)) {
        # Confidence (k == 1) and class channels (k > 5): masked BCE
        my_loss <- my_loss + CE_loss_function(indata = yolomap_split[[pos]],
                                              inlabel = label_split[[pos]],
                                              obj = obj_mask,
                                              lambda = lambda * weight,
                                              eps = 1e-4)
        if (k == 1) {
          # Extra no-object term for the confidence channel, with weight 1
          my_loss <- my_loss + CE_loss_function(indata = yolomap_split[[pos]],
                                                inlabel = label_split[[pos]],
                                                obj = 1 - obj_mask,
                                                lambda = 1,
                                                eps = 1e-4)
        }
      } else {
        # Localization channels (2:5): masked MSE
        my_loss <- my_loss + MSE_loss_function(indata = yolomap_split[[pos]],
                                               inlabel = label_split[[pos]],
                                               obj = obj_mask,
                                               lambda = lambda * weight)
      }
    }
  }
  return(my_loss)
}
# Define the label variable and wrap the loss into a trainable symbol
label <- mx.symbol.Variable(name = "label")
yolo_loss <- YOLO_loss_function(indata = yolomap, inlabel = label, final_map = 6, num_box = 1, lambda = 10, weight_classification = 0.2, name = 'yolo')
final_yolo_loss <- mx.symbol.MakeLoss(data = yolo_loss)
– 先讓我們從這裡下載所需要的檔案
– 如果你想弄懂怎樣從JPG檔案變成我們現在需要的格式,請你參考MxNetR-YOLO/pikachu/code/1. Processing data的過程
# Libraries
library(OpenImageR)
library(jpeg)
library(mxnet)
library(imager)
# Load data (Training set): a list of raw JPEG byte vectors plus a data frame
# of bounding boxes; coordinates are relative values in [0, 1]
load('data/train_img_list.RData')
load('data/train_box_info.RData')
head(train_box_info)
## obj_name col_left col_right row_bot row_top prob img_id
## 1 pikachu 0.6267570 0.7256063 0.4658268 0.3013253 1 1
## 2 pikachu 0.5070340 0.5993253 0.4963081 0.3682864 1 2
## 3 pikachu 0.5904536 0.6917713 0.5608004 0.3917792 1 3
## 4 pikachu 0.5722729 0.6571676 0.5396996 0.4144326 1 4
## 5 pikachu 0.3893552 0.5016431 0.4850163 0.3470082 1 5
## 6 pikachu 0.3819232 0.4916472 0.5595707 0.4213461 1 6
# Each image is stored as raw JPEG bytes (note the ff d8 JPEG magic number)
head(train_img_list[[1]], 20)
## [1] ff d8 ff e0 00 10 4a 46 49 46 00 01 01 00 00 01 00 01 00 00
# Display an image with optional bounding boxes and an optional grid overlay.
# Args:
#   img       : image array (any numeric range; it is min-max rescaled first)
#   box_info  : data frame with columns obj_name, col_left, col_right,
#               row_bot, row_top, prob (and optionally col); NULL for no boxes
#   show_prob : append each box's confidence to its label
#   show_grid : draw the n.grid x n.grid YOLO cell grid with (row, col) labels
# Note the flipped ylim: the origin is top-left, matching image coordinates.
Show_img <- function (img, box_info = NULL, show_prob = FALSE, col_bbox = '#FFFFFF00', col_label = '#FF0000FF',
                      show_grid = FALSE, n.grid = 8, col_grid = '#0000FFFF') {
  require(imager)
  par(mar = rep(0, 4))
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.96, 0.04), xaxt = "n", yaxt = "n", bty = "n")
  img <- (img - min(img))/(max(img) - min(img))
  img <- as.raster(img)
  rasterImage(img, 0, 1, 1, 0, interpolate = FALSE)
  if (!is.null(box_info)) {
    # Clamp coordinates into [0, 1].
    # Fix: these clamping lines used to run before the NULL check and made the
    # function error with the default box_info = NULL.
    box_info[box_info[,2] < 0, 2] <- 0
    box_info[box_info[,3] > 1, 3] <- 1
    box_info[box_info[,4] > 1, 4] <- 1
    box_info[box_info[,5] < 0, 5] <- 0
    for (i in 1:nrow(box_info)) {
      if (is.null(box_info$col[i])) {COL_LABEL <- col_label} else {COL_LABEL <- box_info$col[i]}
      if (show_prob) {
        TEXT <- paste0(box_info[i,1], ' (', formatC(box_info[i,6]*100, 0, format = 'f'), '%)')
      } else {
        TEXT <- box_info[i,1]
      }
      # Scale label and line sizes with the box width
      size <- max(box_info[i,3] - box_info[i,2], 0.05)
      rect(xleft = box_info[i,2], xright = box_info[i,2] + 0.04*sqrt(size)*nchar(TEXT),
           ybottom = box_info[i,5] + 0.08*sqrt(size), ytop = box_info[i,5],
           col = COL_LABEL, border = COL_LABEL, lwd = 0)
      text(x = box_info[i,2] + 0.02*sqrt(size) * nchar(TEXT),
           y = box_info[i,5] + 0.04*sqrt(size),
           labels = TEXT,
           col = 'white', cex = 1.5*sqrt(size), font = 2)
      rect(xleft = box_info[i,2], xright = box_info[i,3],
           ybottom = box_info[i,4], ytop = box_info[i,5],
           col = col_bbox, border = COL_LABEL, lwd = 5*sqrt(size))
    }
  }
  if (show_grid) {
    for (i in 1:n.grid) {
      if (i != n.grid) {
        abline(a = i/n.grid, b = 0, col = col_grid, lwd = 12/n.grid)
        abline(v = i/n.grid, col = col_grid, lwd = 12/n.grid)
      }
      for (j in 1:n.grid) {
        text((i-0.5)/n.grid, (j-0.5)/n.grid, paste0('(', j, ', ', i, ')'), col = col_grid, cex = 8/n.grid)
      }
    }
  }
}
# Visual check: show the first training image with its ground-truth box
img_id <- 1
resized_img <- readJPEG(train_img_list[[img_id]])
sub_BOX_INFOS <- train_box_info[train_box_info$img_id %in% img_id,]
Show_img(img = resized_img, box_info = sub_BOX_INFOS, show_grid = FALSE)
– 這裡還需要一個函數「IoU_function」,因為在未來做輸出預測的時候很有可能會產生多個大範圍重疊的框框,所以我們需要用到非極大值抑制(Non-Maximum Suppression, NMS)來移除多餘的框:
# Custom function
# Note: this function made some efforts to keep the coordinate system consistent.
# The major challenge is that 'bottomleft' is the original point of "plot" function,
# but the original point of image is 'topleft'.
# Intersection-over-Union of two boxes supplied as 1-row data frames with
# columns in the order (left, right, bottom, top), where bottom > top
# numerically (image coordinates). Returns 0 when the boxes do not overlap.
IoU_function <- function (label, pred) {
  overlap_width <- min(label[,2], pred[,2]) - max(label[,1], pred[,1])
  overlap_height <- min(label[,3], pred[,3]) - max(label[,4], pred[,4])
  # Fix: scalar && instead of vectorized & -- the `if` condition must be length 1
  if (overlap_width > 0 && overlap_height > 0) {
    pred_size <- (pred[,2]-pred[,1])*(pred[,3]-pred[,4])
    label_size <- (label[,2]-label[,1])*(label[,3]-label[,4])
    overlap_size <- overlap_width * overlap_height
    return(overlap_size/(pred_size + label_size - overlap_size))
  } else {
    return(0)
  }
}
# Encode a bounding-box data frame into a YOLO label array of shape
# (n.grid, n.grid, 5 + n_classes, n_images). For the grid cell containing
# each box centre it stores: confidence 1, the centre's relative (row, col)
# position inside that cell, log-width and log-height (in cell units), and a
# one-hot class indicator.
Encode_fun <- function (box_info, n.grid = 8, eps = 1e-8, obj_name = 'pikachu') {
  img_ids <- unique(box_info$img_id)
  num_pred <- 5 + length(obj_name)
  out_array <- array(0, dim = c(n.grid, n.grid, num_pred, length(img_ids)))
  for (img_pos in seq_along(img_ids)) {
    current_boxes <- box_info[box_info$img_id == img_ids[img_pos],]
    for (b in seq_len(nrow(current_boxes))) {
      # Centre and size expressed in grid-cell units
      center_r <- (current_boxes[b,4] + current_boxes[b,5]) / 2 * n.grid
      center_c <- (current_boxes[b,2] + current_boxes[b,3]) / 2 * n.grid
      box_w <- (current_boxes[b,3] - current_boxes[b,2]) * n.grid
      box_h <- (current_boxes[b,4] - current_boxes[b,5]) * n.grid
      cell_r <- ceiling(center_r)
      cell_c <- ceiling(center_c)
      # Position of the centre inside its cell; a centre lying exactly on a
      # cell border (fractional part 0) counts as the far edge of that cell
      rel_r <- center_r %% 1
      if (rel_r == 0) {rel_r <- 1}
      rel_c <- center_c %% 1
      if (rel_c == 0) {rel_c <- 1}
      out_array[cell_r, cell_c, 1, img_pos] <- 1
      out_array[cell_r, cell_c, 2, img_pos] <- rel_r
      out_array[cell_r, cell_c, 3, img_pos] <- rel_c
      out_array[cell_r, cell_c, 4, img_pos] <- log(box_w + eps)
      out_array[cell_r, cell_c, 5, img_pos] <- log(box_h + eps)
      out_array[cell_r, cell_c, 5 + which(obj_name %in% current_boxes$obj_name[b]), img_pos] <- 1
    }
  }
  return(out_array)
}
# Decode a YOLO output array of shape (grid, grid, k*num_feature, n_images)
# back into a bounding-box data frame. Cells whose confidence is at least
# `cut_prob` become candidate boxes; per-class non-maximum suppression then
# drops any box overlapping a higher-confidence box by IoU >= cut_overlap.
Decode_fun <- function (encode_array, cut_prob = 0.5, cut_overlap = 0.3,
                        obj_name = 'pikachu',
                        obj_col = '#FF0000FF',
                        img_id_list = NULL) {
  num_img <- dim(encode_array)[4]
  num_feature <- length(obj_name) + 5
  # Channel offsets, one per box predictor
  pos_start <- (0:(dim(encode_array)[3]/num_feature-1)*num_feature)
  box_info <- NULL
  # Decoding
  for (j in 1:num_img) {
    sub_box_info <- NULL
    for (i in 1:length(pos_start)) {
      sub_encode_array <- as.array(encode_array)[,,pos_start[i]+1:num_feature,j]
      pos_over_cut <- which(sub_encode_array[,,1] >= cut_prob)
      if (length(pos_over_cut) >= 1) {
        # Convert column-major linear indices back to (row, col)
        pos_over_cut_row <- pos_over_cut %% dim(sub_encode_array)[1]
        pos_over_cut_row[pos_over_cut_row == 0] <- dim(sub_encode_array)[1]
        pos_over_cut_col <- ceiling(pos_over_cut/dim(sub_encode_array)[1])
        for (l in 1:length(pos_over_cut)) {
          encode_vec <- sub_encode_array[pos_over_cut_row[l],pos_over_cut_col[l],]
          # Clamp the relative positions into [0, 1]
          if (encode_vec[2] < 0) {encode_vec[2] <- 0}
          if (encode_vec[2] > 1) {encode_vec[2] <- 1}
          if (encode_vec[3] < 0) {encode_vec[3] <- 0}
          if (encode_vec[3] > 1) {encode_vec[3] <- 1}
          # Undo the encoding: cell index + in-cell offset; exp of log-size
          center_row <- (encode_vec[2] + (pos_over_cut_row[l] - 1))/dim(sub_encode_array)[1]
          center_col <- (encode_vec[3] + (pos_over_cut_col[l] - 1))/dim(sub_encode_array)[2]
          width <- exp(encode_vec[4])/dim(sub_encode_array)[2]
          height <- exp(encode_vec[5])/dim(sub_encode_array)[1]
          if (is.null(img_id_list)) {new_img_id <- j} else {new_img_id <- img_id_list[j]}
          new_box_info <- data.frame(obj_name = obj_name[which.max(encode_vec[-c(1:5)])],
                                     col_left = center_col-width/2,
                                     col_right = center_col+width/2,
                                     row_bot = center_row+height/2,
                                     row_top = center_row-height/2,
                                     prob = encode_vec[1],
                                     img_id = new_img_id,
                                     col = obj_col[which.max(encode_vec[-c(1:5)])],
                                     stringsAsFactors = FALSE)
          sub_box_info <- rbind(sub_box_info, new_box_info)
        }
      }
    }
    if (!is.null(sub_box_info)) {
      # Remove overlapping boxes: greedy per-class NMS, highest confidence first
      sub_box_info <- sub_box_info[order(sub_box_info$prob, decreasing = TRUE),]
      for (obj in unique(sub_box_info$obj_name)) {
        obj_sub_box_info <- sub_box_info[sub_box_info$obj_name == obj,]
        if (nrow(obj_sub_box_info) == 1) {
          box_info <- rbind(box_info, obj_sub_box_info)
        } else {
          overlap_seq <- NULL
          for (m in 2:nrow(obj_sub_box_info)) {
            for (n in 1:(m-1)) {
              if (!n %in% overlap_seq) {
                # Fix: removed two unused overlap_width/overlap_height locals
                # that duplicated work already performed inside IoU_function
                overlap_prob <- IoU_function(label = obj_sub_box_info[m,2:5], pred = obj_sub_box_info[n,2:5])
                if (overlap_prob >= cut_overlap) {
                  overlap_seq <- c(overlap_seq, m)
                }
              }
            }
          }
          if (!is.null(overlap_seq)) {
            obj_sub_box_info <- obj_sub_box_info[-overlap_seq,]
          }
          box_info <- rbind(box_info, obj_sub_box_info)
        }
      }
    }
  }
  return(box_info)
}
# Test Encode & Decode function: encode the ground truth of image 1 and
# decode it back; the restored box should match the original annotation
img_id <- 1
resized_img <- readJPEG(train_img_list[[img_id]])
sub_BOX_INFOS <- train_box_info[train_box_info$img_id %in% img_id,]
Encode_label <- Encode_fun(box_info = sub_BOX_INFOS)
restore_BOX_INFOS <- Decode_fun(encode_array = Encode_label)
Show_img(img = resized_img, box_info = restore_BOX_INFOS, show_grid = TRUE)
# Inspect the encoded label array (only the cell containing the centre is non-zero)
Encode_label
## , , 1, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 1 0 0
## [5,] 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0
## [7,] 0 0 0 0 0 0 0 0
## [8,] 0 0 0 0 0 0 0 0
##
## , , 2, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0.00000000 0 0
## [2,] 0 0 0 0 0 0.00000000 0 0
## [3,] 0 0 0 0 0 0.00000000 0 0
## [4,] 0 0 0 0 0 0.06860864 0 0
## [5,] 0 0 0 0 0 0.00000000 0 0
## [6,] 0 0 0 0 0 0.00000000 0 0
## [7,] 0 0 0 0 0 0.00000000 0 0
## [8,] 0 0 0 0 0 0.00000000 0 0
##
## , , 3, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0.0000000 0 0
## [2,] 0 0 0 0 0 0.0000000 0 0
## [3,] 0 0 0 0 0 0.0000000 0 0
## [4,] 0 0 0 0 0 0.4094529 0 0
## [5,] 0 0 0 0 0 0.0000000 0 0
## [6,] 0 0 0 0 0 0.0000000 0 0
## [7,] 0 0 0 0 0 0.0000000 0 0
## [8,] 0 0 0 0 0 0.0000000 0 0
##
## , , 4, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0.0000000 0 0
## [2,] 0 0 0 0 0 0.0000000 0 0
## [3,] 0 0 0 0 0 0.0000000 0 0
## [4,] 0 0 0 0 0 -0.2347173 0 0
## [5,] 0 0 0 0 0 0.0000000 0 0
## [6,] 0 0 0 0 0 0.0000000 0 0
## [7,] 0 0 0 0 0 0.0000000 0 0
## [8,] 0 0 0 0 0 0.0000000 0 0
##
## , , 5, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0.0000000 0 0
## [2,] 0 0 0 0 0 0.0000000 0 0
## [3,] 0 0 0 0 0 0.0000000 0 0
## [4,] 0 0 0 0 0 0.2746061 0 0
## [5,] 0 0 0 0 0 0.0000000 0 0
## [6,] 0 0 0 0 0 0.0000000 0 0
## [7,] 0 0 0 0 0 0.0000000 0 0
## [8,] 0 0 0 0 0 0.0000000 0 0
##
## , , 6, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 1 0 0
## [5,] 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0
## [7,] 0 0 0 0 0 0 0 0
## [8,] 0 0 0 0 0 0 0 0
# Build an iterator
# Image ids present in the training annotations
train_ids <- unique(train_box_info[,'img_id'])
# Core data iterator: yields batches of (preprocessed image array, encoded
# YOLO label array), with optional random horizontal flip and random crop
# augmentation. Depends on the globals train_ids, train_img_list,
# train_box_info, preproc.image and Encode_fun.
my_iterator_core <- function (batch_size, img_size = 256, resize_method = 'bilinear',
aug_crop = TRUE, aug_flip = TRUE) {
batch <- 0
batch_per_epoch <- floor(length(train_ids)/batch_size)
# Restart the epoch
reset <- function() {batch <<- 0}
# Advance to the next batch; FALSE signals the end of the epoch
iter.next <- function() {
batch <<- batch + 1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value <- function() {
idx <- 1:batch_size + (batch - 1) * batch_size
# Pad an incomplete final batch with randomly re-sampled earlier ids
idx[idx > length(train_ids)] <- sample(1:(idx[1]-1), sum(idx > length(train_ids)))
idx <- sort(idx)
batch.box_info <- train_box_info[train_box_info$img_id %in% train_ids[idx],]
#t0 <- Sys.time()
img_array <- array(0, dim = c(img_size, img_size, 3, batch_size))
for (i in 1:batch_size) {
read_img <- readJPEG(train_img_list[[train_ids[idx[i]]]])
img_array[,,,i] <- preproc.image(read_img, width = img_size, height = img_size, method = resize_method)
}
if (aug_flip) {
# Horizontal flip applied to the whole batch with probability 1/2;
# left/right box coordinates are mirrored accordingly
original_dim <- dim(img_array)
if (sample(0:1, 1) == 1) {
img_array <- img_array[,original_dim[2]:1,,]
flip_left <- 1 - batch.box_info[,2]
flip_right <- 1 - batch.box_info[,3]
batch.box_info[,2] <- flip_right
batch.box_info[,3] <- flip_left
dim(img_array) <- original_dim
}
}
if (aug_crop) {
# Random crop to (img_size - 32) pixels; box coordinates are rescaled,
# shifted, then clamped to [0, 1]
revised_dim <- dim(img_array)
revised_dim[1:2] <- img_size - 32
random.row <- sample(0:32, 1)
random.col <- sample(0:32, 1)
img_array <- img_array[random.row+1:(img_size-32),random.col+1:(img_size-32),,]
dim(img_array) <- revised_dim
# NOTE(review): the crop offset is divided by the hard-coded 256 rather
# than by (img_size - 32); for img_size = 256 this slightly misplaces the
# boxes and it breaks for other img_size values -- confirm intent
batch.box_info[,4:5] <- batch.box_info[,4:5] * img_size / (img_size - 32) - random.row/256
batch.box_info[,2:3] <- batch.box_info[,2:3] * img_size / (img_size - 32) - random.col/256
for (j in 2:5) {
batch.box_info[batch.box_info[,j] <= 0,j] <- 0
batch.box_info[batch.box_info[,j] >= 1,j] <- 1
}
}
# Grid size follows the (possibly cropped) image size: one cell per 32 pixels
label <- Encode_fun(box_info = batch.box_info, n.grid = dim(img_array)[1]/32)
label <- mx.nd.array(label)
data <- mx.nd.array(img_array)
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
# Reference-class wrapper exposing the core iterator through the interface
# mxnet's training loop expects (reset / iter.next / value)
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size", "img_size", "resize_method", "aug_crop", "aug_flip"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 16, img_size = 256, resize_method = 'nearest',
aug_crop = TRUE, aug_flip = TRUE){
.self$iter <- my_iterator_core(batch_size = batch_size, img_size = img_size, resize_method = resize_method,
aug_crop = aug_crop, aug_flip = aug_flip)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
# Test iterator function
my_iter <- my_iterator_func(iter = NULL, batch_size = 16, img_size = 256, resize_method = 'bilinear',
aug_crop = TRUE, aug_flip = TRUE)
my_iter$reset()
my_iter$iter.next()
## [1] TRUE
test <- my_iter$value()
# Undo the mean subtraction so the first image of the batch can be displayed
img_seq <- 1
iter_img <- as.array(test$data)[,,,img_seq]
iter_img[,,1] <- iter_img[,,1] + 123.68
iter_img[,,2] <- iter_img[,,2] + 116.78
iter_img[,,3] <- iter_img[,,3] + 103.94
iter_img <- iter_img / 255
# Decode the batch labels and show the boxes belonging to that image
iter_box_info <- Decode_fun(test$label)
Show_img(img = iter_img, box_info = iter_box_info[iter_box_info$img_id == img_seq,], show_grid = FALSE)
# Custom callback function
my.eval.metric.loss <- mx.metric.custom(
name = "multi_part_loss",
function(label, pred) {
return(as.array(pred))
}
)
# Batch-end callback: every `frequency` batches, log the training speed
# (samples/sec) and the current value of the training metric via message().
my.callback_batch <- function (batch.size = 16, frequency = 10) {
function(iteration, nbatch, env, verbose = TRUE) {
count <- nbatch
if (is.null(env$count))
env$count <- 0
if (is.null(env$init))
env$init <- FALSE
# A batch counter going backwards means a new epoch has started
if (env$count > count)
env$init <- FALSE
env$count = count
if (env$init) {
if (count%%frequency == 0 && !is.null(env$metric)) {
# Elapsed time since the last report determines the throughput
time <- as.double(difftime(Sys.time(), env$tic,
units = "secs"))
speed <- frequency * batch.size/time
result <- env$metric$get(env$train.metric)
if (nbatch != 0 & verbose) {
message(paste0("Batch [", nbatch, "] Speed: ",
formatC(speed, 3, format = "f"), " samples/sec Train-", result$name,
"=", as.array(result$value)))
}
env$tic = Sys.time()
}
}
else {
# First call of the epoch: just start the timer
env$init <- TRUE
env$tic <- Sys.time()
}
}
}
# Epoch-end callback: every `period` epochs, save a checkpoint that packages
# the trained weights with the *prediction* symbol (out_symbol, i.e. the
# network without the loss) so it can be reloaded directly for inference.
# Optionally records the metric history into a mx.metric.logger.
my.callback_epoch <- function (out_symbol, logger = NULL,
                               prefix = 'model/yolo_v1',
                               fixed.params = NULL,
                               period = 1) {
  function(iteration, nbatch, env, verbose = TRUE) {
    if (iteration%%period == 0) {
      env_model <- env$model
      # Re-package the trained weights with the loss-free symbol
      # (fix: dropped an unused env_all_layers local)
      model_write_out <- list(symbol = out_symbol,
                              arg.params = env_model$arg.params,
                              aux.params = env_model$aux.params)
      # Append parameters that were kept fixed during training
      model_write_out[[2]] <- append(model_write_out[[2]], fixed.params)
      class(model_write_out) <- "MXFeedForwardModel"
      mx.model.save(model_write_out, prefix, iteration)
      if (verbose) {
        message(sprintf("Model checkpoint saved to %s-%04d.params", prefix, iteration))
      }
    }
    if (!is.null(logger)) {
      # Fix: inherits() is the robust class test (class(x) may have length > 1)
      if (!inherits(logger, "mx.metric.logger")) {
        stop("Invalid mx.metric.logger.")
      } else {
        result <- env$metric$get(env$train.metric)
        logger$train <- c(logger$train, result$value)
        if (!is.null(env$eval.metric)) {
          result <- env$metric$get(env$eval.metric)
          logger$eval <- c(logger$eval, result$value)
        }
      }
    }
    return(TRUE)
  }
}
# initiate Parameter for model (random init for the newly added YOLO-head layers)
new_arg <- mxnet:::mx.model.init.params(symbol = final_yolo_loss,
input.shape = list(data = c(224, 224, 3, 13),
label = c(7, 7, 6, 13)),
output.shape = NULL, initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24),
ctx = mx.cpu())
# Bind Pre-trained Parameter into model: copy every pre-trained weight whose
# name also exists in the new network (i.e. the backbone layers)
Pre_trained_ARG <- Pre_Trained_model$arg.params
ARG_in_net_name <- names(Pre_trained_ARG) %>% .[. %in% names(new_arg$arg.params)] # keep only names present in the new model
for (i in 1:length(ARG_in_net_name)){
new_arg$arg.params[names(new_arg$arg.params) == ARG_in_net_name[i]] <- Pre_trained_ARG[names(Pre_trained_ARG) == ARG_in_net_name[i]]
}
ARG.PARAMS <- new_arg$arg.params
# Model Training: SGD with momentum; each epoch's checkpoint stores the
# loss-free prediction symbol (yolomap) via the epoch callback
my_logger <- mx.metric.logger$new()
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 5e-3, momentum = 0.9, wd = 1e-4)
my_iter <- my_iterator_func(iter = NULL, batch_size = 16, img_size = 256, aug_crop = TRUE, aug_flip = TRUE)
YOLO_model <- mx.model.FeedForward.create(final_yolo_loss, X = my_iter,
ctx = mx.cpu(), num.round = 100, optimizer = my_optimizer,
arg.params = ARG.PARAMS, eval.metric = my.eval.metric.loss,
input.names = 'data', output.names = 'label',
batch.end.callback = my.callback_batch(batch.size = 16, frequency = 10),
epoch.end.callback = my.callback_epoch(out_symbol = yolomap, logger = my_logger,
prefix = 'model/yolo_pikachu', period = 1))
親手訓練一個自己的模型是很重要的,讓我們重複一次整個過程,並且用你的模型預測Validation的圖像。
如果你將所有的程式碼複製貼上一次,理論上你應該會得到一個物件「YOLO_model」,你可以把它存下來並且做後續應用。
– 如果你因為電腦問題沒辦法很快的得到模型,你可以下載yolo_v1-symbol.json以及yolo_v1-0000.params這兩個檔案,直接使用已經訓練好的模型。
# Load validation dataset
val_img_list_path <- 'data/val_img_list.RData'
val_box_info_path <- 'data/val_box_info.RData'
load(val_img_list_path)
load(val_box_info_path)
# Select an image
used_img_id <- 3
img <- readJPEG(val_img_list[[used_img_id]])
sub_BOX_INFOS <- val_box_info[val_box_info$img_id %in% used_img_id,]
# Show image with its ground-truth box
Show_img(img = img, box_info = sub_BOX_INFOS, show_grid = FALSE)
– 現在的問題是,我們該如何解碼這個輸出呢?
# Select an image
used_img_id <- 3
img <- readJPEG(val_img_list[[used_img_id]])
img_array <- preproc.image(img, width = 256, height = 256)
# Predict: the saved symbol outputs the raw 8x8x6 YOLO map (not a loss)
pred_out <- mxnet:::predict.MXFeedForwardModel(model = YOLO_model, X = img_array)
# Show output
pred_out
## , , 1, 1
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2.175886e-05 7.765517e-05 6.398771e-04 4.800425e-05 3.087417e-04
## [2,] 2.059357e-05 8.112825e-06 8.250023e-05 4.238817e-05 2.192027e-05
## [3,] 3.450739e-05 5.807074e-06 6.497771e-05 1.424192e-04 4.559512e-06
## [4,] 2.848043e-05 1.614074e-05 9.043092e-05 4.936542e-05 3.811426e-04
## [5,] 3.861512e-05 7.038127e-06 3.769622e-05 1.182349e-04 3.700055e-06
## [6,] 1.567435e-04 1.349305e-04 1.849353e-04 1.043047e-04 1.259342e-04
## [7,] 1.289619e-04 1.553132e-05 6.090637e-05 1.070057e-04 3.868345e-05
## [8,] 1.478953e-04 3.951438e-05 1.161170e-05 5.032596e-06 7.407457e-06
## [,6] [,7] [,8]
## [1,] 4.980038e-05 1.157811e-04 3.085980e-05
## [2,] 1.784436e-05 6.316273e-06 6.267606e-05
## [3,] 2.839258e-07 1.065451e-07 5.282246e-05
## [4,] 9.999599e-01 1.109684e-09 3.983207e-05
## [5,] 1.797017e-06 3.168526e-08 2.658951e-05
## [6,] 1.699532e-05 3.337686e-06 1.179599e-04
## [7,] 4.990354e-05 3.586038e-06 5.054910e-04
## [8,] 2.041740e-05 3.922387e-05 6.184297e-05
##
## , , 2, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.6801450 0.5976912 0.5578443 0.5918126 0.5708206 0.56628942
## [2,] 0.6017925 0.4811206 0.4763767 0.6140453 0.6175091 0.71936470
## [3,] 0.5250729 0.5215399 0.4980438 0.6516942 0.9127852 0.98351616
## [4,] 0.5178071 0.4388809 0.4853486 0.6659355 0.7007865 0.46315542
## [5,] 0.5521403 0.5104556 0.5306891 0.5617887 0.5484521 0.07533818
## [6,] 0.5054854 0.4036428 0.4691980 0.4881016 0.3890624 0.20025289
## [7,] 0.4830571 0.3368378 0.4328839 0.3259984 0.2888888 0.30536026
## [8,] 0.1950672 0.2086147 0.2983870 0.3268004 0.3251016 0.34801593
## [,7] [,8]
## [1,] 0.4895642 0.5102438
## [2,] 0.5594086 0.3605731
## [3,] 0.8632115 0.3051354
## [4,] 0.8210499 0.2974217
## [5,] 0.6029838 0.2879823
## [6,] 0.4206741 0.3367405
## [7,] 0.3760834 0.3530687
## [8,] 0.2279852 0.3351136
##
## , , 3, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.4751273 0.4651938 0.4465239 0.4151169 0.4999950 0.4306733
## [2,] 0.4107683 0.3209447 0.3014812 0.3956809 0.5529618 0.4268115
## [3,] 0.3442642 0.2379502 0.2282802 0.4067807 0.8361494 0.5245005
## [4,] 0.3177827 0.2611143 0.1835159 0.4224148 0.9619181 0.4594024
## [5,] 0.3556828 0.2542045 0.1604596 0.3569542 0.9365183 0.5691842
## [6,] 0.3685289 0.2939848 0.1063866 0.1701014 0.4143588 0.3491405
## [7,] 0.4114659 0.4028378 0.1914930 0.1742721 0.2317697 0.2430892
## [8,] 0.3812750 0.4862524 0.3801619 0.3776010 0.4030150 0.4106047
## [,7] [,8]
## [1,] 0.353520840 0.29838821
## [2,] 0.194996879 0.26636934
## [3,] 0.018944228 0.08806755
## [4,] 0.002033974 0.06880085
## [5,] 0.067198150 0.11599731
## [6,] 0.206061244 0.24148135
## [7,] 0.413511306 0.35321954
## [8,] 0.501760602 0.47578865
##
## , , 4, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.16380896 0.18155675 0.28477421 0.26052853 0.14869145 1.746523e-01
## [2,] 0.09346113 0.07011465 0.28132915 0.20149416 0.10489836 7.165832e-02
## [3,] 0.06652214 0.04279037 0.23358104 0.16400860 0.02069804 3.327309e-03
## [4,] 0.07173117 0.03271503 0.19422546 0.15870336 0.01134628 2.795498e-06
## [5,] 0.06090595 0.02956775 0.17395532 0.16388598 0.07179106 3.275967e-02
## [6,] 0.05138716 0.01869932 0.10863931 0.12763382 0.05215968 6.555290e-02
## [7,] 0.04294629 0.01137724 0.03496335 0.05444485 0.06518067 9.568776e-02
## [8,] 0.16907576 0.10101945 0.11087536 0.11840142 0.12028946 1.815709e-01
## [,7] [,8]
## [1,] 0.322869450 0.33299318
## [2,] 0.152524248 0.33520162
## [3,] 0.003178379 0.15376464
## [4,] 0.003520291 0.14748868
## [5,] 0.004487852 0.14020464
## [6,] 0.074568249 0.16894498
## [7,] 0.053376798 0.08523209
## [8,] 0.202904403 0.25414819
##
## , , 5, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.4079272 0.3261908 0.31690216 0.3296756 0.2410627 0.30813831
## [2,] 0.3231982 0.1925583 0.23288627 0.2219387 0.1364148 0.13665222
## [3,] 0.3041542 0.1440943 0.14642000 0.1195453 0.1154995 0.11228056
## [4,] 0.3211254 0.1410730 0.12788023 0.1273584 0.2776253 0.07437739
## [5,] 0.2957854 0.1432805 0.11832014 0.1300656 0.3067705 0.37014121
## [6,] 0.2880217 0.1308005 0.09683795 0.1272122 0.1974431 0.17784469
## [7,] 0.2730801 0.2067374 0.21397069 0.2436326 0.3125082 0.27777615
## [8,] 0.3678777 0.4176787 0.36428547 0.4605475 0.4652155 0.49894223
## [,7] [,8]
## [1,] 0.2823304 0.3006729
## [2,] 0.2228489 0.3521521
## [3,] 0.2160688 0.2232081
## [4,] 0.2493887 0.2283885
## [5,] 0.3104072 0.1852986
## [6,] 0.2060298 0.3484747
## [7,] 0.2975561 0.3538238
## [8,] 0.4790375 0.3997004
##
## , , 6, 1
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.6715143 0.6999446 0.8284470 0.6146195 0.7441497 0.5818871 0.5682781
## [2,] 0.6660944 0.7690154 0.8895428 0.7087998 0.8185279 0.7829362 0.6707016
## [3,] 0.7812672 0.9148030 0.9607246 0.9199726 0.9097051 0.9738428 0.9504442
## [4,] 0.7853820 0.9341723 0.9638215 0.9004298 0.9767600 0.9999992 0.9458553
## [5,] 0.7923232 0.9272834 0.9464728 0.9402913 0.9519114 0.9891417 0.9584301
## [6,] 0.8093688 0.9569872 0.9716794 0.9559923 0.9816977 0.9860116 0.9243711
## [7,] 0.8482025 0.9808857 0.9699230 0.9536955 0.9580126 0.9475592 0.9488915
## [8,] 0.9244884 0.9356574 0.9094222 0.8698427 0.9022931 0.8695790 0.8862187
## [,8]
## [1,] 0.6327722
## [2,] 0.5652426
## [3,] 0.7620061
## [4,] 0.7947741
## [5,] 0.8452669
## [6,] 0.7523736
## [7,] 0.8143295
## [8,] 0.8567851
# Decode output: threshold the confidences and apply per-class NMS
pred_box_info <- Decode_fun(pred_out, cut_prob = 0.5, cut_overlap = 0.3)
pred_box_info
## obj_name col_left col_right row_bot row_top prob img_id
## 1 pikachu 0.6199251 0.7449255 0.5002203 0.3655686 0.9999599 1
## col
## 1 #FF0000FF
# Show image with the predicted box and its confidence
Show_img(img = img, box_info = pred_box_info, show_prob = TRUE, show_grid = FALSE)
– 但這些比賽通常同時有多種物件同時需要識別,所以比賽的指標一般來說都使用「mean Average Precision (mAP)」,而這也是相關Paper使用的模型評估指標。
我們先從「Precision」開始理解起,它的定義其實就是醫學上常用的「Positive Predictive Value」。
而所謂的「Average Precision」就是指說,在每一個不同的「Recall」之下,所有Precision的平均值。
– 「Recall」就是醫學上常用的「Sensitivity」
– 根據他的定義,其實就是畫一條「Precision x Recall curve」,並計算它的曲線下面積:
# Worked example: computing Average Precision by hand.
# Sample information: 4 ground-truth objects; 6 predictions with confidence
# scores (pred_value) and a 0/1 flag saying whether each is a true positive
num_obj <- 4
pred_value <- c(0.93, 0.75, 0.67, 0.71, 0.82, 0.91)
real_value <- c(1, 1, 0, 1, 0, 0)
# Calculation process
# Rank predictions by descending confidence (PASCAL VOC protocol)
real_value <- real_value[order(pred_value, decreasing=TRUE)]
# Cumulative true positives, then precision at each true-positive position;
# zero entries (false positives) contribute nothing and are dropped
cum_TP <- cumsum(real_value)
P_list <- cum_TP * real_value / seq_along(real_value)
P_list <- P_list[P_list!=0]
# Interpolate precision: propagate later, higher precision values backwards
# until the sequence is non-increasing (fixed-point iteration)
while (sum(diff(P_list) > 0) >= 1) {
diff_P_list <- diff(P_list)
diff_P_list[diff_P_list < 0] <- 0
P_list <- P_list + c(diff_P_list, 0)
}
# Average Precision = summed interpolated precision / number of objects
sum(P_list)/num_obj
## [1] 0.55
# IoU between the ground-truth box and the predicted box (columns 2:5 hold
# col_left, col_right, row_bot, row_top)
IoU_function(sub_BOX_INFOS[,2:5], pred_box_info[,2:5])
## [1] 0.418427
# Sample information: this time each prediction carries its IoU against the
# best-matching ground-truth box instead of a hard 0/1 label
num_obj <- 4
pred_value <- c(0.93, 0.75, 0.67, 0.71, 0.82, 0.91)
real_IoU <- c(0.75, 0.81, 0.42, 0.69, 0.27, 0.39)
# Calculation function
# Compute (interpolated) Average Precision for a set of predictions.
#
# Args:
#   obj_IoU : numeric vector, IoU of each prediction against its best-matching
#             ground-truth box.
#   obj_prob: numeric vector, confidence score of each prediction.
#   num_obj : total number of ground-truth objects (denominator of recall).
#   IoU_cut : IoU threshold above which a prediction counts as a true positive.
#
# Returns: a single numeric AP value in [0, 1].
AP_function <- function (obj_IoU, obj_prob, num_obj, IoU_cut = 0.5) {
  # Rank predictions by descending confidence (PASCAL VOC protocol)
  sort_obj_IoU <- obj_IoU[order(obj_prob, decreasing = TRUE)]
  pred_positive <- sort_obj_IoU > IoU_cut
  cum_TP <- cumsum(pred_positive)
  # Precision at each true-positive position; zero entries (false positives)
  # do not contribute to AP and are dropped
  P_list <- cum_TP * pred_positive / seq_along(pred_positive)
  P_list <- P_list[P_list != 0]
  # Interpolated precision: replace each value by the maximum precision at any
  # equal-or-higher recall. rev(cummax(rev(.))) is the exact closed form of the
  # original fixed-point while-loop, is guaranteed to terminate, and handles
  # the empty case (no true positives -> AP = 0).
  P_list <- rev(cummax(rev(P_list)))
  return(sum(P_list) / num_obj)
}
# Show AP: with the default IoU_cut = 0.5 this reproduces the hand-computed
# value from the worked example above
AP_function(obj_IoU = real_IoU, obj_prob = pred_value, num_obj = num_obj)
## [1] 0.55
– 比較有意思的是,剛剛的訓練過程中你應該有把每一代的模型都儲存下來了,你是否能稍微找一下訓練到第幾代就差不多了,不要過度訓練以防overfitting?
– 當然,你也可以試著調整訓練中所使用的參數,舉例來說…
在我們的Loss function中因為嚴重的類別不平衡,我們給定\(\lambda = 10\)以及\(\mbox{weight_classification} = 0.2\),你可以調整看看是不是會更好
在預測的時候,我們移除多餘預測框使用了\(\mbox{cut_overlap} = 0.3\),降低這個值會減少多餘的框,這會不會增加Average Precision呢
換一個起始模型進行轉移特徵學習,舉例來說改成ResNet,並且你可以修正它後面所連接的結構
有一堆額外的超參數可以讓你調整,像是Batch size、L2正則化的強度、學習率等
# Evaluate the trained YOLO v1 model: predict on the validation set, match each
# prediction to its best ground-truth box by IoU, and compute Average Precision.
# Load model (checkpoint 0 of 'model/yolo_v1')
YOLO_model <- mx.model.load('model/yolo_v1', 0)
# Load validation dataset (image path list and ground-truth box annotations)
val_img_list_path <- 'data/val_img_list.RData'
val_box_info_path <- 'data/val_box_info.RData'
load(val_img_list_path)
load(val_box_info_path)
# Read images into a (256, 256, 3, n) array expected by the network
img_array <- array(0, dim = c(256, 256, 3, length(val_img_list)))
for (i in 1:length(val_img_list)) {
img <- readJPEG(val_img_list[[i]])
img_array[,,,i] <- preproc.image(img, width = 256, height = 256)
}
# Predict and decode
pred_out <- mxnet:::predict.MXFeedForwardModel(model = YOLO_model, X = img_array)
pred_box_info <- Decode_fun(pred_out, cut_prob = 0.5, cut_overlap = 0.3)
# Calculate IoU: for each predicted box, the best IoU over all ground-truth
# boxes belonging to the same image.
# NOTE(review): assumes every predicted img_id has at least one ground-truth
# box; a prediction on an unlabeled image would make 1:nrow(...) fail — confirm
pred_box_info$IoU <- 0
for (m in 1:nrow(pred_box_info)) {
sub_label_box_info <- val_box_info[val_box_info[,'img_id'] == pred_box_info[m,'img_id'], ]
IoUs <- numeric(nrow(sub_label_box_info))
for (n in 1:nrow(sub_label_box_info)) {
IoUs[n] <- IoU_function(label = sub_label_box_info[n,2:5], pred = pred_box_info[m,2:5])
}
pred_box_info[m,'IoU'] <- max(IoUs)
}
# Calculate AP over the whole validation set (TP = prediction with IoU > 0.5)
obj_IoU <- pred_box_info[,'IoU']
obj_prob <- pred_box_info[,'prob']
num_obj <- nrow(val_box_info)
AP_function(obj_IoU = obj_IoU, obj_prob = obj_prob, num_obj = num_obj, IoU_cut = 0.5)
## [1] 0.8955892
– 在剛剛的實驗中,我們是透過了加權正樣本10倍來粗略的解決這個問題,但似乎太過簡單暴力了。
如果你有學過Gradient Boosting Machine的話,你會將會想到可以在訓練過程中只挑較難的樣本進行學習,而假設已經學的很好了那就把它移除,而Ross Girshick也有在2016年使用這個方法:Online Hard Example Mining (OHEM),最終也取得還不錯的成果
但這個方法因為在每次訓練時找出較難分類的樣本作為下一次訓練的樣本,編寫程式上相當複雜(因為過程不可微)
– 因此,Kaiming He、Ross Girshick與他們Facebook的同事又合作提出了一個新的損失函數:Focal Loss,它的想法是Soft-OHEM,並基於這個方法訓練出了RetinaNet
\[CE(y, p, \alpha) = -\frac{{1}}{n}\sum \limits_{i=1}^{n} \left(\alpha \cdot y_{i} \cdot log(p_{i}) + (1 - \alpha) \cdot (1-y_{i}) \cdot log(1-p_{i})\right)\]
\[FL(y, p, \alpha, \gamma) = -\frac{{1}}{n}\sum \limits_{i=1}^{n} \left(\alpha \cdot (1 - p_{i})^{\gamma} \cdot y_{i} \cdot log(p_{i}) + (1 - \alpha) \cdot p_{i}^{\gamma} \cdot (1-y_{i}) \cdot log(1-p_{i})\right)\]
# Masked binary cross-entropy loss built as an mxnet symbol graph.
# indata : predicted probabilities (expected in [0, 1], e.g. after sigmoid)
# inlabel: 0/1 target labels, broadcast-compatible with indata
# obj    : mask/weight symbol multiplied element-wise into the loss
#          (restricts the loss to selected grid cells)
# lambda : scalar multiplier on the final averaged loss
# eps    : small offset inside log() to avoid log(0)
CE_loss_function <- function (indata, inlabel, obj, lambda, eps = 1e-4) {
log_pred_1 <- mx.symbol.log(data = indata + eps)      # log(p + eps)
log_pred_2 <- mx.symbol.log(data = 1 - indata + eps)  # log(1 - p + eps)
# y * log(p) and (1 - y) * log(1 - p)
multiple_log_pred_label_1 <- mx.symbol.broadcast_mul(lhs = log_pred_1, rhs = inlabel)
multiple_log_pred_label_2 <- mx.symbol.broadcast_mul(lhs = log_pred_2, rhs = 1 - inlabel)
# Apply the obj mask, then average over all four axes (spatial/channel/batch)
obj_weighted_loss <- mx.symbol.broadcast_mul(lhs = obj, rhs = multiple_log_pred_label_1 + multiple_log_pred_label_2)
average_CE_loss <- mx.symbol.mean(data = obj_weighted_loss, axis = 0:3, keepdims = FALSE)
# Negate (cross-entropy is minimized) and scale by lambda
CE_loss <- 0 - average_CE_loss * lambda
return(CE_loss)
}
# Focal loss (Lin et al., RetinaNet) as an mxnet symbol graph: cross-entropy
# modulated by (1 - p)^gamma for positive labels and p^gamma for negative
# labels, which down-weights easy, well-classified examples.
# gamma = 0 reduces this exactly to CE_loss_function above.
# indata : predicted probabilities; inlabel: 0/1 targets
# obj    : mask/weight applied element-wise; lambda: scalar loss weight
# eps    : stabilizes both log() and the power terms
Focal_loss_function <- function (indata, inlabel, obj, lambda, gamma = 0, eps = 1e-4) {
log_pred_1 <- mx.symbol.log(data = indata + eps)
log_pred_2 <- mx.symbol.log(data = 1 - indata + eps)
multiple_log_pred_label_1 <- mx.symbol.broadcast_mul(lhs = log_pred_1, rhs = inlabel)
multiple_log_pred_label_2 <- mx.symbol.broadcast_mul(lhs = log_pred_2, rhs = 1 - inlabel)
# Focusing terms (1 - p)^gamma and p^gamma multiply the two CE branches
obj_weighted_loss <- mx.symbol.broadcast_mul(lhs = obj, rhs = (1 - indata + eps)^gamma * multiple_log_pred_label_1 + (indata + eps)^gamma * multiple_log_pred_label_2)
average_Focal_loss <- mx.symbol.mean(data = obj_weighted_loss, axis = 0:3, keepdims = FALSE)
Focal_loss <- 0 - average_Focal_loss * lambda
return(Focal_loss)
}
\[MSE(y,\hat{y}) = \frac{1}{n}\sum \limits_{i=1}^{n} (y_{i} - \hat{y}_{i})^2\]
– 但到了物件識別領域中又存在問題了,那就是若誤差太大時他給的損失值會以平方加權,但它所負責的部分是邊界框的長寬以及座標,偏移過多錯了就錯了,似乎不用給太大的損失。
\[MAE(y,\hat{y}) = \frac{1}{n}\sum \limits_{i=1}^{n} |y_{i} - \hat{y}_{i}|\]
– 因此,我們又迫切的需要一個損失函數,滿足上述特性但具有連續可微的性質!
\[L(y,\hat{y}) = \frac{1}{n}\sum \limits_{i=1}^{n} log(cosh(y_{i} - \hat{y}_{i}))\]
\[cosh(x) = \frac{e^x + e^{-x}}{2}\]
# Squared-error loss as an mxnet symbol graph, masked by obj and averaged over
# all four axes (so despite the sum in the lecture formula, the implementation
# takes a mean), then scaled by lambda.
MSE_loss_function <- function (indata, inlabel, obj, lambda) {
diff_pred_label <- mx.symbol.broadcast_minus(lhs = indata, rhs = inlabel)
square_diff_pred_label <- mx.symbol.square(data = diff_pred_label)
obj_square_diff_loss <- mx.symbol.broadcast_mul(lhs = obj, rhs = square_diff_pred_label)
MSE_loss <- mx.symbol.mean(data = obj_square_diff_loss, axis = 0:3, keepdims = FALSE)
return(MSE_loss * lambda)
}
# Log-cosh regression loss as an mxnet symbol graph: log(cosh(x)) behaves like
# x^2/2 for small errors and like |x| - log(2) for large errors, so it is
# smooth (differentiable everywhere) yet robust to large outliers.
# Masked by obj, averaged over all axes, scaled by lambda.
# NOTE(review): cosh() overflows for very large |error| (roughly |x| > 710 in
# double precision) — assumed not to occur with the bounded offsets used here.
LOGCOSH_loss_function <- function (indata, inlabel, obj, lambda) {
diff_pred_label <- mx.symbol.broadcast_minus(lhs = indata, rhs = inlabel)
cosh_diff_pred_label <- mx.symbol.cosh(data = diff_pred_label)
logcosh_diff_pred_label <- mx.symbol.log(data = cosh_diff_pred_label)
obj_logcosh_diff_pred_label <- mx.symbol.broadcast_mul(lhs = obj, rhs = logcosh_diff_pred_label)
LOGCOSH_loss <- mx.symbol.mean(data = obj_logcosh_diff_pred_label, axis = 0:3, keepdims = FALSE)
return(LOGCOSH_loss * lambda)
}
– 這應該是一個很簡單的題目,你只需要修改Model Architecture的部分,而剩下的部分完全都不用動到就能執行了!
# Libraries
library(mxnet)
library(magrittr)
## Define the model architecture
## Use pre-trained model and fine tuning
# Load MobileNet v2 (checkpoint 0) to serve as the feature-extraction backbone
Pre_Trained_model <- mx.model.load('model/mobilev2', 0)
# Get the internal output: cut the pretrained graph at an intermediate layer
# and use that feature map as the input of the new detection head
Mobile_symbol <- Pre_Trained_model$symbol
Mobile_All_layer <- Mobile_symbol$get.internals()
basic_out <- which(Mobile_All_layer$outputs == 'conv6_3_linear_bn_output') %>% Mobile_All_layer$get.output()
# mx.symbol.infer.shape(basic_out, data = c(256, 256, 3, 7))$out.shapes
# conv6_3_linear_bn_output out shape = 8 8 320 n (if input shape = 256 256 3 n)
# Convolution layer for specific mission and training new parameters
# 1. Additional some architecture for better learning
# MobileNet-v2-style inverted-residual ("bottleneck") block:
# 1x1 expansion conv (channels x Inverse_coef) -> 3x3 depthwise conv ->
# 1x1 projection conv back to num_filters, each followed by batch-norm and
# (for the first two stages) LeakyReLU.
# When residual = TRUE the input is added back to the projection output, which
# requires indata to already have num_filters channels.
# NOTE(review): the name fragment 'expend' is a misspelling of 'expand', kept
# as-is because symbol names are tied to saved parameter files.
DWCONV_function <- function (indata, num_filters = 256, Inverse_coef = 6, residual = TRUE, name = 'lvl1', stage = 1) {
# 1x1 expansion: widen the channel dimension before the depthwise conv
expend_conv <- mx.symbol.Convolution(data = indata, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
no.bias = TRUE, num.filter = num_filters * Inverse_coef,
name = paste0(name, '_', stage, '_expend'))
expend_bn <- mx.symbol.BatchNorm(data = expend_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_expend_bn'))
expend_relu <- mx.symbol.LeakyReLU(data = expend_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_expend_relu'))
# 3x3 depthwise conv: num.group equal to channel count -> one filter per channel
dwise_conv <- mx.symbol.Convolution(data = expend_relu, kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1),
no.bias = TRUE, num.filter = num_filters * Inverse_coef, num.group = num_filters * Inverse_coef,
name = paste0(name, '_', stage, '_dwise'))
dwise_bn <- mx.symbol.BatchNorm(data = dwise_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_dwise_bn'))
dwise_relu <- mx.symbol.LeakyReLU(data = dwise_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_dwise_relu'))
# 1x1 projection back to num_filters channels (linear bottleneck: no activation
# before the residual addition)
restore_conv <- mx.symbol.Convolution(data = dwise_relu, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
no.bias = TRUE, num.filter = num_filters,
name = paste0(name, '_', stage, '_restore'))
restore_bn <- mx.symbol.BatchNorm(data = restore_conv, fix_gamma = FALSE, name = paste0(name, '_', stage, '_restore_bn'))
if (residual) {
block <- mx.symbol.broadcast_plus(lhs = indata, rhs = restore_bn, name = paste0(name, '_', stage, '_block'))
return(block)
} else {
restore_relu <- mx.symbol.LeakyReLU(data = restore_bn, act.type = 'leaky', slope = 0.1, name = paste0(name, '_', stage, '_restore_relu'))
return(restore_relu)
}
}
# Plain 1x1 convolution block: convolution -> batch-norm -> ReLU.
# All generated symbol names share the '<name>_<stage>_' prefix.
CONV_function <- function (indata, num_filters = 256, name = 'lvl1', stage = 1) {
  prefix <- paste0(name, '_', stage)
  conv_out <- mx.symbol.Convolution(data = indata, kernel = c(1, 1),
                                    stride = c(1, 1), pad = c(0, 0),
                                    no.bias = TRUE, num.filter = num_filters,
                                    name = paste0(prefix, '_conv'))
  bn_out <- mx.symbol.BatchNorm(data = conv_out, fix_gamma = FALSE,
                                name = paste0(prefix, '_bn'))
  mx.symbol.Activation(data = bn_out, act.type = 'relu',
                       name = paste0(prefix, '_relu'))
}
# Build the YOLO detection head: dropout + 1x1 convolution producing
# `final_map` channels, then a sigmoid on every channel EXCEPT the
# width/height channels (positions 4 and 5 within each box), which must stay
# linear because they regress raw log-scale size values (see Encode_fun /
# Decode_fun, which store log ratios and decode with exp()).
YOLO_map_function <- function (indata, final_map = 6, num_box = 1, drop = 0.2, name = 'lvl1') {
  dp <- mx.symbol.Dropout(data = indata, p = drop, name = paste0(name, '_drop'))
  conv <- mx.symbol.Convolution(data = dp, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                                no.bias = FALSE, num.filter = final_map, name = paste0(name, '_linearmap'))
  inter_split <- mx.symbol.SliceChannel(data = conv, num_outputs = final_map,
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, "_inter_split"))
  # Features per box: confidence, y, x, w, h, classes...
  num_feature <- final_map / num_box
  new_list <- list()
  for (k in 1:final_map) {
    # Position of channel k within its own box block (1..num_feature).
    # BUG FIX: the original tested `k %% num_box`, which is always 0 when
    # num_box = 1 (and never 4 or 5 for num_box <= 3), so the sigmoid was
    # wrongly applied to the width/height channels as well.
    feature_pos <- (k - 1) %% num_feature + 1
    if (!feature_pos %in% c(4:5)) {
      new_list[[k]] <- mx.symbol.Activation(inter_split[[k]], act.type = 'sigmoid', name = paste0(name, "_yolomap_", k))
    } else {
      new_list[[k]] <- inter_split[[k]]
    }
  }
  yolomap <- mx.symbol.concat(data = new_list, num.args = final_map, dim = 1, name = paste0(name, "_yolomap"))
  return(yolomap)
}
# Assemble the detection head on top of the backbone feature map:
# two inverted-residual blocks, one 1x1 conv block, then the YOLO output map
yolo_conv_1 <- DWCONV_function(indata = basic_out, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 1)
yolo_conv_2 <- DWCONV_function(indata = yolo_conv_1, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 2)
yolo_conv_3 <- CONV_function(indata = yolo_conv_2, num_filters = 320, name = 'yolo', stage = 3)
yolomap <- YOLO_map_function(indata = yolo_conv_3, final_map = 6, drop = 0.2, name = 'final')
# 2. Custom loss function
# Log-cosh regression loss (identical to the definition earlier in this lesson,
# repeated so the homework script is self-contained): log(cosh(x)) ~ x^2/2 for
# small errors and ~ |x| - log(2) for large errors. Masked by obj, averaged
# over all axes, scaled by lambda.
LOGCOSH_loss_function <- function (indata, inlabel, obj, lambda) {
diff_pred_label <- mx.symbol.broadcast_minus(lhs = indata, rhs = inlabel)
cosh_diff_pred_label <- mx.symbol.cosh(data = diff_pred_label)
logcosh_diff_pred_label <- mx.symbol.log(data = cosh_diff_pred_label)
obj_logcosh_diff_pred_label <- mx.symbol.broadcast_mul(lhs = obj, rhs = logcosh_diff_pred_label)
LOGCOSH_loss <- mx.symbol.mean(data = obj_logcosh_diff_pred_label, axis = 0:3, keepdims = FALSE)
return(LOGCOSH_loss * lambda)
}
# Focal loss (identical to the definition earlier in this lesson, repeated so
# the homework script is self-contained): cross-entropy modulated by
# (1 - p)^gamma for positives and p^gamma for negatives; gamma = 0 reduces it
# to plain cross-entropy. obj masks cells, lambda scales the averaged loss,
# eps stabilizes log() and the power terms.
Focal_loss_function <- function (indata, inlabel, obj, lambda, gamma = 0, eps = 1e-4) {
log_pred_1 <- mx.symbol.log(data = indata + eps)
log_pred_2 <- mx.symbol.log(data = 1 - indata + eps)
multiple_log_pred_label_1 <- mx.symbol.broadcast_mul(lhs = log_pred_1, rhs = inlabel)
multiple_log_pred_label_2 <- mx.symbol.broadcast_mul(lhs = log_pred_2, rhs = 1 - inlabel)
obj_weighted_loss <- mx.symbol.broadcast_mul(lhs = obj, rhs = (1 - indata + eps)^gamma * multiple_log_pred_label_1 + (indata + eps)^gamma * multiple_log_pred_label_2)
average_Focal_loss <- mx.symbol.mean(data = obj_weighted_loss, axis = 0:3, keepdims = FALSE)
Focal_loss <- 0 - average_Focal_loss * lambda
return(Focal_loss)
}
# Total YOLO loss: focal loss on the confidence + classification channels and
# log-cosh regression loss on the localization channels, summed over all boxes.
#
# Channel layout per box (num_feature = final_map / num_box):
#   1 = objectness, 2:3 = center offsets, 4:5 = log width/height,
#   6.. = class probabilities.
# lambda weights the positive-cell losses (class imbalance compensation),
# weight_classification additionally down-weights the class channels, and
# gamma is the focal-loss focusing parameter.
YOLO_loss_function <- function (indata, inlabel, final_map = 6, num_box = 1, lambda = 10, gamma = 2, weight_classification = 0.2, name = 'yolo') {
  num_feature <- final_map/num_feature_default <- NULL  # placeholder removed below
  num_feature <- final_map/num_box
  my_loss <- 0
  # FIX: use paste0 — the original used paste(), whose default sep = " "
  # injected a space into these two symbol names, inconsistent with every
  # other symbol name in this file
  yolomap_split <- mx.symbol.SliceChannel(data = indata, num_outputs = final_map,
                                          axis = 1, squeeze_axis = FALSE, name = paste0(name, '_yolomap_split'))
  label_split <- mx.symbol.SliceChannel(data = inlabel, num_outputs = final_map,
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, '_label_split'))
  for (j in 1:num_box) {
    for (k in 1:num_feature) {
      # Confidence/localization channels get full weight; class channels are
      # down-weighted by weight_classification
      if (k %in% 1:5) {weight <- 1} else {weight <- weight_classification}
      if (!k %in% c(2:5)) {
        if (k == 1) {
          # Objectness: positive cells weighted by lambda, negative cells by 1
          my_loss <- my_loss + Focal_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                   inlabel = label_split[[(j-1)*num_feature+k]],
                                                   obj = label_split[[(j-1)*num_feature+1]],
                                                   lambda = lambda * weight,
                                                   gamma = gamma,
                                                   eps = 1e-4)
          my_loss <- my_loss + Focal_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                   inlabel = label_split[[(j-1)*num_feature+k]],
                                                   obj = 1 - label_split[[(j-1)*num_feature+1]],
                                                   lambda = 1,
                                                   gamma = gamma,
                                                   eps = 1e-4)
        } else {
          # Classification channels: only cells containing an object contribute
          my_loss <- my_loss + Focal_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                   inlabel = label_split[[(j-1)*num_feature+k]],
                                                   obj = label_split[[(j-1)*num_feature+1]],
                                                   lambda = lambda * weight,
                                                   gamma = gamma,
                                                   eps = 1e-4)
        }
      } else {
        # Localization channels (2:5): log-cosh regression on positive cells
        my_loss <- my_loss + LOGCOSH_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                   inlabel = label_split[[(j-1)*num_feature+k]],
                                                   obj = label_split[[(j-1)*num_feature+1]],
                                                   lambda = lambda * weight)
      }
    }
  }
  return(my_loss)
}
# Bind the label placeholder and build the final training loss symbol
label <- mx.symbol.Variable(name = "label")
yolo_loss <- YOLO_loss_function(indata = yolomap, inlabel = label, final_map = 6, num_box = 1, lambda = 10, gamma = 2, weight_classification = 0.2, name = 'yolo')
final_yolo_loss <- mx.symbol.MakeLoss(data = yolo_loss)
– 這個原因是我們今天訓練的是一個YOLO v1模型,而錨框(anchor box)的使用是從YOLO v2的模型開始的。
– 錨框的想法是非常重要的,否則我們很難在YOLO v1的基礎上增加每個grid所預測的bounding box數量。
– 錨框的長寬位置仍然是非常重要的,舉例來說我們要做道路上的人車識別,很有可能行人的長寬比大多都是3比1,而汽車的長寬比大多都是1比5,因此我們就可以運用不同的錨框來預測相似屬性的物件
– 引入錨框之後,Encode與Decode的過程變成了計算與錨框之間的差異:
– Joseph Redmon在YOLO v2的論文中:YOLO9000: Better, Faster, Stronger提出了另一種決定錨框的長寬比的思路,那就是把這些框的長寬比做聚類分析(clustering analysis),之後再決定出數個錨框。
# Derive anchor boxes from the training annotations via k-means clustering on
# log(width)/log(height) — the YOLO v2 strategy of learning anchors from data.
# box_info_path (Training and Validation set)
original_box_info_path <- 'data/train_box_info.RData'
revised_box_info_path <- 'data/train_box_info (yolo v3).RData'
anchor_boxs_path <- 'data/anchor_boxs (yolo v3).RData'
# Start to define anchor boxes
load(original_box_info_path)
# Widths/heights on the log scale so clustering treats sizes multiplicatively
anchor_box_info <- data.frame(width = log(train_box_info[,3] - train_box_info[,2]),
height = log(train_box_info[,4] - train_box_info[,5]),
stringsAsFactors = FALSE)
# 9 clusters = 3 anchors at each of 3 detection scales (YOLO v3 convention)
kmean_model <- kmeans(x = anchor_box_info, centers = 9, iter.max = 10)
anchor_boxs <- as.data.frame(kmean_model$centers, stringsAsFactors = FALSE)
# Back-transform cluster centers to actual width/height
anchor_boxs$width <- exp(anchor_boxs$width)
anchor_boxs$height <- exp(anchor_boxs$height)
# Rank anchors by area; lvl = detection scale (1..3), seq = anchor index within
# a scale, col = display colour for the visualization below
anchor_boxs$rank <- rank(anchor_boxs[,1] * anchor_boxs[,2])
anchor_boxs$lvl <- ceiling(anchor_boxs$rank / 3)
anchor_boxs$seq <- anchor_boxs$rank %% 3 + 1
anchor_boxs$col <- rainbow(9)[anchor_boxs$rank]
anchor_boxs
## width height rank lvl seq col
## 1 0.10738119 0.1260670 5 2 3 #00FFAAFF
## 2 0.07754509 0.1306079 2 1 3 #FFAA00FF
## 3 0.08676591 0.1370560 4 2 2 #00FF00FF
## 4 0.08017669 0.1209479 1 1 2 #FF0000FF
## 5 0.13684827 0.1713482 9 3 1 #FF00AAFF
## 6 0.11698890 0.1433591 7 3 2 #0000FFFF
## 7 0.09489442 0.1504152 6 2 1 #00AAFFFF
## 8 0.09162414 0.1277321 3 1 1 #AAFF00FF
## 9 0.10745922 0.1703601 8 3 3 #AA00FFFF
# Visualization: every training box (width x height) coloured by its assigned
# anchor cluster
par(mar = c(5, 4, 4, 2))
plot(exp(anchor_box_info$width), exp(anchor_box_info$height), pch = 19, cex = 1.5,
col = anchor_boxs$col[kmean_model$cluster],
xlab = 'Width', ylab = 'Height', main = 'Anchor box clusters')
# Add anchor box info to train_box_info: box centers, true box sizes, and the
# matched anchor's size / rank / scale (lvl) / within-scale index (seq)
train_box_info$bbox_center_row <- (train_box_info[,4] + train_box_info[,5])/2
train_box_info$bbox_center_col <- (train_box_info[,2] + train_box_info[,3])/2
# exp() undoes the log taken when building anchor_box_info, giving back the
# original box width/height
train_box_info$bbox_width <- exp(anchor_box_info$width)
train_box_info$bbox_height <- exp(anchor_box_info$height)
train_box_info$anchor_width <- anchor_boxs$width[kmean_model$cluster]
train_box_info$anchor_height <- anchor_boxs$height[kmean_model$cluster]
train_box_info$rank <- anchor_boxs$rank[kmean_model$cluster]
train_box_info$lvl <- anchor_boxs$lvl[kmean_model$cluster]
train_box_info$seq <- anchor_boxs$seq[kmean_model$cluster]
head(train_box_info)
## obj_name col_left col_right row_bot row_top prob img_id
## 1 pikachu 0.6267570 0.7256063 0.4658268 0.3013253 1 1
## 2 pikachu 0.5070340 0.5993253 0.4963081 0.3682864 1 2
## 3 pikachu 0.5904536 0.6917713 0.5608004 0.3917792 1 3
## 4 pikachu 0.5722729 0.6571676 0.5396996 0.4144326 1 4
## 5 pikachu 0.3893552 0.5016431 0.4850163 0.3470082 1 5
## 6 pikachu 0.3819232 0.4916472 0.5595707 0.4213461 1 6
## bbox_center_row bbox_center_col bbox_width bbox_height anchor_width
## 1 0.3835761 0.6761816 0.09884930 0.1645015 0.10745922
## 2 0.4322973 0.5531797 0.09229130 0.1280217 0.09162414
## 3 0.4762898 0.6411124 0.10131770 0.1690212 0.10745922
## 4 0.4770661 0.6147203 0.08489472 0.1252670 0.08017669
## 5 0.4160123 0.4454992 0.11228791 0.1380081 0.11698890
## 6 0.4904584 0.4367852 0.10972404 0.1382246 0.11698890
## anchor_height rank lvl seq
## 1 0.1703601 8 3 3
## 2 0.1277321 3 1 1
## 3 0.1703601 8 3 3
## 4 0.1209479 1 1 2
## 5 0.1433591 7 3 2
## 6 0.1433591 7 3 2
# Save data: the revised annotations plus the anchor table, sorted by rank so
# that row order matches rank/lvl/seq lookups at decode time
save(train_box_info, file = revised_box_info_path)
anchor_boxs <- anchor_boxs[order(anchor_boxs$rank),]
rownames(anchor_boxs) <- 1:nrow(anchor_boxs)
save(anchor_boxs, file = anchor_boxs_path)
# Encode ground-truth boxes into YOLO v3 multi-scale label arrays.
# box_info : annotation data.frame; must already carry the bbox_center_*,
#            bbox_*, anchor_*, lvl and seq columns added in the
#            anchor-assignment step above
# n.grid   : grid resolution of each detection scale
# eps      : offset inside log() to avoid log(0)
# n.anchor : number of anchors per scale
# obj_name : class-name vector; its length fixes the class-channel count
# Returns a list with one array of shape
# (grid, grid, n.anchor * (5 + n.class), n.img) per scale.
Encode_fun <- function (box_info, n.grid = c(32, 16, 8), eps = 1e-8, n.anchor = 3,
obj_name = 'pikachu') {
img_IDs <- unique(box_info$img_id)
num_pred <- 5 + length(obj_name)  # conf + y + x + w + h + classes
out_array_list <- list()
for (k in 1:length(n.grid)) {
out_array_list[[k]] <- array(0, dim = c(n.grid[k], n.grid[k], n.anchor * num_pred, length(img_IDs)))
}
for (j in 1:length(img_IDs)) {
sub_box_info <- box_info[box_info$img_id == img_IDs[j],]
for (k in 1:length(n.grid)) {
if (k %in% sub_box_info$lvl) {
# Keep obj_name, centers/sizes/anchor sizes (cols 8:13) and lvl/seq, then
# rescale the coordinate/size columns into units of this scale's grid
rescale_box_info <- sub_box_info[sub_box_info$lvl == k,c(1, 8:13, 15:16)]
rescale_box_info[,2:7] <- rescale_box_info[,2:7] * n.grid[k]
for (i in 1:nrow(rescale_box_info)) {
# Grid cell containing the box center; the center's relative position
# within that cell lies in (0, 1], with an exact boundary mapped to 1
center_row <- ceiling(rescale_box_info$bbox_center_row[i])
center_col <- ceiling(rescale_box_info$bbox_center_col[i])
row_related_pos <- rescale_box_info$bbox_center_row[i] %% 1
row_related_pos[row_related_pos == 0] <- 1
col_related_pos <- rescale_box_info$bbox_center_col[i] %% 1
col_related_pos[col_related_pos == 0] <- 1
# Per-anchor channel block: 1 = objectness, 2:3 = center offsets,
# 4:5 = log size ratios vs the matched anchor, 6.. = one-hot class
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+1,j] <- 1
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+2,j] <- row_related_pos
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+3,j] <- col_related_pos
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+4,j] <- log(rescale_box_info$bbox_width[i]/rescale_box_info$anchor_width[i] + eps)
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+5,j] <- log(rescale_box_info$bbox_height[i]/rescale_box_info$anchor_height[i] + eps)
out_array_list[[k]][center_row,center_col,(rescale_box_info$seq[i]-1)*num_pred+5+which(obj_name %in% rescale_box_info$obj_name[i]),j] <- 1
}
}
}
}
return(out_array_list)
}
# Decode YOLO v3 multi-scale output arrays back into bounding boxes.
#
# encode_array_list: list with one array per detection scale, each of shape
#                    (grid, grid, n.anchor * (5 + n.class), n.img).
# anchor_boxs      : anchor table whose first two columns are width/height and
#                    which carries lvl/seq columns matching the encoding step.
# cut_prob         : minimum confidence for a cell to produce a box.
# cut_overlap      : IoU threshold for non-maximum suppression within a class.
# obj_name/obj_col : class names and their display colours.
#
# Returns a data.frame of surviving boxes, or NULL if nothing passes cut_prob.
# NOTE(review): the output column is named 'img_ID' while other tables in this
# lesson use 'img_id' — confirm downstream scripts before renaming it.
Decode_fun <- function (encode_array_list, anchor_boxs,
                        cut_prob = 0.5, cut_overlap = 0.5,
                        obj_name = 'pikachu',
                        obj_col = '#FF0000FF') {
  num_list <- length(encode_array_list)
  num_img <- dim(encode_array_list[[1]])[4]
  num_feature <- length(obj_name) + 5
  # Channel offset of each anchor's feature block within the array
  pos_start <- (0:(dim(encode_array_list[[1]])[3]/num_feature-1)*num_feature)
  box_info <- NULL
  # Decoding
  for (j in 1:num_img) {
    sub_box_info <- NULL
    for (k in 1:num_list) {
      for (i in 1:length(pos_start)) {
        sub_encode_array <- as.array(encode_array_list[[k]])[,,pos_start[i]+1:num_feature,j]
        # Grid cells whose confidence channel passes the threshold
        pos_over_cut <- which(sub_encode_array[,,1] >= cut_prob)
        if (length(pos_over_cut) >= 1) {
          # Convert linear indices into (row, col) grid coordinates
          pos_over_cut_row <- pos_over_cut %% dim(sub_encode_array)[1]
          pos_over_cut_row[pos_over_cut_row == 0] <- dim(sub_encode_array)[1]
          pos_over_cut_col <- ceiling(pos_over_cut/dim(sub_encode_array)[1])
          anchor_box <- anchor_boxs[anchor_boxs$lvl == k & anchor_boxs$seq == i, 1:2]
          for (l in 1:length(pos_over_cut)) {
            encode_vec <- sub_encode_array[pos_over_cut_row[l],pos_over_cut_col[l],]
            # Clamp the relative center offsets into [0, 1]
            if (encode_vec[2] < 0) {encode_vec[2] <- 0}
            if (encode_vec[2] > 1) {encode_vec[2] <- 1}
            if (encode_vec[3] < 0) {encode_vec[3] <- 0}
            if (encode_vec[3] > 1) {encode_vec[3] <- 1}
            # Absolute center in image coordinates; size = anchor scaled by
            # exp(predicted log-ratio)
            center_row <- (encode_vec[2] + (pos_over_cut_row[l] - 1))/dim(sub_encode_array)[1]
            center_col <- (encode_vec[3] + (pos_over_cut_col[l] - 1))/dim(sub_encode_array)[2]
            width <- exp(encode_vec[4]) * anchor_box[1,1]
            height <- exp(encode_vec[5]) * anchor_box[1,2]
            new_box_info <- data.frame(obj_name = obj_name[which.max(encode_vec[-c(1:5)])],
                                       col_left = center_col-width/2,
                                       col_right = center_col+width/2,
                                       row_bot = center_row+height/2,
                                       row_top = center_row-height/2,
                                       prob = encode_vec[1],
                                       img_ID = j,
                                       col = obj_col[which.max(encode_vec[-c(1:5)])],
                                       stringsAsFactors = FALSE)
            sub_box_info <- rbind(sub_box_info, new_box_info)
          }
        }
      }
    }
    if (!is.null(sub_box_info)) {
      # Remove overlapping boxes: greedy per-class NMS keeping the
      # higher-probability box of each overlapping pair
      sub_box_info <- sub_box_info[order(sub_box_info$prob, decreasing = TRUE),]
      for (obj in unique(sub_box_info$obj_name)) {
        obj_sub_box_info <- sub_box_info[sub_box_info$obj_name == obj,]
        if (nrow(obj_sub_box_info) == 1) {
          box_info <- rbind(box_info, obj_sub_box_info)
        } else {
          overlap_seq <- NULL
          for (m in 2:nrow(obj_sub_box_info)) {
            for (n in 1:(m-1)) {
              if (!n %in% overlap_seq) {
                # FIX: dropped two dead locals (overlap_width/overlap_height)
                # that were computed here but never read
                overlap_prob <- IoU_function(label = obj_sub_box_info[m,2:5], pred = obj_sub_box_info[n,2:5])
                if (overlap_prob >= cut_overlap) {
                  overlap_seq <- c(overlap_seq, m)
                }
              }
            }
          }
          if (!is.null(overlap_seq)) {
            obj_sub_box_info <- obj_sub_box_info[-overlap_seq,]
          }
          box_info <- rbind(box_info, obj_sub_box_info)
        }
      }
    }
  }
  return(box_info)
}
# Load data (Training set)
load('data/train_img_list.RData')
load('data/train_box_info (yolo v3).RData')
load('data/anchor_boxs (yolo v3).RData')
# Test Encode & Decode function: encoding an annotation and decoding it back
# should reproduce (approximately) the original ground-truth box
img_id <- 1
resized_img <- readJPEG(train_img_list[[img_id]])
sub_BOX_INFOS <- train_box_info[train_box_info$img_id %in% img_id,]
Encode_label <- Encode_fun(box_info = sub_BOX_INFOS)
restore_BOX_INFOS <- Decode_fun(encode_array_list = Encode_label, anchor_boxs = anchor_boxs)
Show_img(img = resized_img, box_info = restore_BOX_INFOS, show_grid = FALSE)
– 至於接著就是要進行模型的訓練及預測,這個部分就請你直接下載現成的語法:pikachu object detection (multi boxes).R以及predict (multi boxes).R
– 上過這節課之後,你再回頭看看上一節課的家庭作業,你是不是覺得更清楚它的運作方式了?
– 由於有太多超參數需要調整,你可能會需要訓練很多次,最終把你的研究過程寫成一個簡短的報告與同學分享。
– 如果你真的想要訓練一個物件識別模型,一定要把Github上的範例:MxNetR-YOLO中VOC2007的部分做過一次,並試著看能不能將整個流程套用到你想要的地方。
– 你可以把上節課作業的模型當作是你的預訓練模型用到自己的任務上,由於結構更為相似他轉移特徵學習的效果應該會更好!